In [47]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns #importing all necessary Libraries
life_expectancy = pd.read_csv('/content/Life Expectancy Data.csv') #Reading the data set
life_expectancy.head() #gives top 5 rows with atrributes
life_expectancy.dropna(inplace=True) # drop rows with missing values
In [ ]:
life_expectancy.dtypes
Out[ ]:
Country                             object
Year                                 int64
Status                              object
Life expectancy                    float64
Adult Mortality                    float64
infant deaths                        int64
Alcohol                            float64
percentage expenditure             float64
Hepatitis B                        float64
Measles                              int64
 BMI                               float64
under-five deaths                    int64
Polio                              float64
Total expenditure                  float64
Diphtheria                         float64
 HIV/AIDS                          float64
GDP                                float64
Population                         float64
 thinness  1-19 years              float64
 thinness 5-9 years                float64
Income composition of resources    float64
Schooling                          float64
dtype: object
In [ ]:
life_expectancy.describe() #statistical summary of the arributes with numerical values
Out[ ]:
Year Life expectancy Adult Mortality infant deaths Alcohol percentage expenditure Hepatitis B Measles BMI under-five deaths Polio Total expenditure Diphtheria HIV/AIDS GDP Population thinness 1-19 years thinness 5-9 years Income composition of resources Schooling
count 1649.000000 1649.000000 1649.000000 1649.000000 1649.000000 1649.000000 1649.000000 1649.000000 1649.000000 1649.000000 1649.000000 1649.000000 1649.000000 1649.000000 1649.000000 1.649000e+03 1649.000000 1649.000000 1649.000000 1649.000000
mean 2007.840509 69.302304 168.215282 32.553062 4.533196 698.973558 79.217708 2224.494239 38.128623 44.220133 83.564585 5.955925 84.155246 1.983869 5566.031887 1.465363e+07 4.850637 4.907762 0.631551 12.119891
std 4.087711 8.796834 125.310417 120.847190 4.029189 1759.229336 25.604664 10085.802019 19.754249 162.897999 22.450557 2.299385 21.579193 6.032360 11475.900117 7.046039e+07 4.599228 4.653757 0.183089 2.795388
min 2000.000000 44.000000 1.000000 0.000000 0.010000 0.000000 2.000000 0.000000 2.000000 0.000000 3.000000 0.740000 2.000000 0.100000 1.681350 3.400000e+01 0.100000 0.100000 0.000000 4.200000
25% 2005.000000 64.400000 77.000000 1.000000 0.810000 37.438577 74.000000 0.000000 19.500000 1.000000 81.000000 4.410000 82.000000 0.100000 462.149650 1.918970e+05 1.600000 1.700000 0.509000 10.300000
50% 2008.000000 71.700000 148.000000 3.000000 3.790000 145.102253 89.000000 15.000000 43.700000 4.000000 93.000000 5.840000 92.000000 0.100000 1592.572182 1.419631e+06 3.000000 3.200000 0.673000 12.300000
75% 2011.000000 75.000000 227.000000 22.000000 7.340000 509.389994 96.000000 373.000000 55.800000 29.000000 97.000000 7.470000 97.000000 0.700000 4718.512910 7.658972e+06 7.100000 7.100000 0.751000 14.000000
max 2015.000000 89.000000 723.000000 1600.000000 17.870000 18961.348600 99.000000 131441.000000 77.100000 2100.000000 99.000000 14.390000 99.000000 50.600000 119172.741800 1.293859e+09 27.200000 28.200000 0.936000 20.700000
In [48]:
life_expectancy.drop(['Country','Status'],inplace=True,axis=1) #not useful for the linear regression model.
life_expectancy
Out[48]:
Year Life expectancy Adult Mortality infant deaths Alcohol percentage expenditure Hepatitis B Measles BMI under-five deaths Polio Total expenditure Diphtheria HIV/AIDS GDP Population thinness 1-19 years thinness 5-9 years Income composition of resources Schooling
0 2015 65.0 263.0 62 0.01 71.279624 65.0 1154 19.1 83 6.0 8.16 65.0 0.1 584.259210 33736494.0 17.2 17.3 0.479 10.1
1 2014 59.9 271.0 64 0.01 73.523582 62.0 492 18.6 86 58.0 8.18 62.0 0.1 612.696514 327582.0 17.5 17.5 0.476 10.0
2 2013 59.9 268.0 66 0.01 73.219243 64.0 430 18.1 89 62.0 8.13 64.0 0.1 631.744976 31731688.0 17.7 17.7 0.470 9.9
3 2012 59.5 272.0 69 0.01 78.184215 67.0 2787 17.6 93 67.0 8.52 67.0 0.1 669.959000 3696958.0 17.9 18.0 0.463 9.8
4 2011 59.2 275.0 71 0.01 7.097109 68.0 3013 17.2 97 68.0 7.87 68.0 0.1 63.537231 2978599.0 18.2 18.2 0.454 9.5
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2933 2004 44.3 723.0 27 4.36 0.000000 68.0 31 27.1 42 67.0 7.13 65.0 33.6 454.366654 12777511.0 9.4 9.4 0.407 9.2
2934 2003 44.5 715.0 26 4.06 0.000000 7.0 998 26.7 41 7.0 6.52 68.0 36.7 453.351155 12633897.0 9.8 9.9 0.418 9.5
2935 2002 44.8 73.0 25 4.43 0.000000 73.0 304 26.3 40 73.0 6.53 71.0 39.8 57.348340 125525.0 1.2 1.3 0.427 10.0
2936 2001 45.3 686.0 25 1.72 0.000000 76.0 529 25.9 39 76.0 6.16 75.0 42.1 548.587312 12366165.0 1.6 1.7 0.427 9.8
2937 2000 46.0 665.0 24 1.68 0.000000 79.0 1483 25.5 39 78.0 7.10 78.0 43.5 547.358878 12222251.0 11.0 11.2 0.434 9.8

1649 rows × 20 columns

In [ ]:
def make_corr_heatmap(df): #function that plots correlation heatmap
  corr = df.corr()
  fig, ax = plt.subplots(figsize=(12,9))
  sns.heatmap(corr, annot=True)
  ax.set_xticklabels(ax.get_xticklabels())
  plt.show()

make_corr_heatmap(life_expectancy)
sns.pairplot(life_expectancy,diag_kind='kde') #displays pair plot with kernel density
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x7e0c2d4d65f0>
In [49]:
#year,infant_deaths,measles,population,total_expenditure
print(life_expectancy.columns)
life_expectancy.drop(['Year','infant deaths','Measles ','Total expenditure','Population'],inplace=True,axis=1) #less correlation with life expectancy: redundant attritubes
#life_expectancy.isnull().count()
Index(['Year', 'Life expectancy ', 'Adult Mortality', 'infant deaths',
       'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ',
       'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ',
       ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years',
       ' thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')
In [50]:
X=life_expectancy.drop('Life expectancy ',axis=1)
Y=life_expectancy['Life expectancy ']
print(X,Y)

#split the dataset into training and testing sets to then train a linear regression model using scikit-learn.
      Adult Mortality  Alcohol  percentage expenditure  Hepatitis B   BMI   \
0               263.0     0.01               71.279624         65.0   19.1   
1               271.0     0.01               73.523582         62.0   18.6   
2               268.0     0.01               73.219243         64.0   18.1   
3               272.0     0.01               78.184215         67.0   17.6   
4               275.0     0.01                7.097109         68.0   17.2   
...               ...      ...                     ...          ...    ...   
2933            723.0     4.36                0.000000         68.0   27.1   
2934            715.0     4.06                0.000000          7.0   26.7   
2935             73.0     4.43                0.000000         73.0   26.3   
2936            686.0     1.72                0.000000         76.0   25.9   
2937            665.0     1.68                0.000000         79.0   25.5   

      under-five deaths   Polio  Diphtheria    HIV/AIDS         GDP  \
0                     83    6.0         65.0        0.1  584.259210   
1                     86   58.0         62.0        0.1  612.696514   
2                     89   62.0         64.0        0.1  631.744976   
3                     93   67.0         67.0        0.1  669.959000   
4                     97   68.0         68.0        0.1   63.537231   
...                  ...    ...          ...        ...         ...   
2933                  42   67.0         65.0       33.6  454.366654   
2934                  41    7.0         68.0       36.7  453.351155   
2935                  40   73.0         71.0       39.8   57.348340   
2936                  39   76.0         75.0       42.1  548.587312   
2937                  39   78.0         78.0       43.5  547.358878   

       thinness  1-19 years   thinness 5-9 years  \
0                      17.2                 17.3   
1                      17.5                 17.5   
2                      17.7                 17.7   
3                      17.9                 18.0   
4                      18.2                 18.2   
...                     ...                  ...   
2933                    9.4                  9.4   
2934                    9.8                  9.9   
2935                    1.2                  1.3   
2936                    1.6                  1.7   
2937                   11.0                 11.2   

      Income composition of resources  Schooling  
0                               0.479       10.1  
1                               0.476       10.0  
2                               0.470        9.9  
3                               0.463        9.8  
4                               0.454        9.5  
...                               ...        ...  
2933                            0.407        9.2  
2934                            0.418        9.5  
2935                            0.427       10.0  
2936                            0.427        9.8  
2937                            0.434        9.8  

[1649 rows x 14 columns] 0       65.0
1       59.9
2       59.9
3       59.5
4       59.2
        ... 
2933    44.3
2934    44.5
2935    44.8
2936    45.3
2937    46.0
Name: Life expectancy , Length: 1649, dtype: float64
In [51]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train,X_test,Y_train,Y_test = train_test_split(X.values,Y.values,test_size=0.2) #dataset is split in the 4:1 ratio for training and testing
In [52]:
model=LinearRegression()
model.fit(X_train,Y_train) #(training) fiting the data to the dataset
print(model.score(X_train,Y_train)) #r2 score
print(model.score(X_test,Y_test)) #print the R2 score for both the training and testing sets to evaluate the model's performance.
0.8289517081052937
0.812440597177055
In [53]:
life_expectancy_coeff = pd.DataFrame(model.coef_,X.columns,columns=['Coefficient'])
life_expectancy_coeff #Understanding the coefficients helps to interpret the impact of each feature on the predicted outcome.
Out[53]:
Coefficient
Adult Mortality -0.018759
Alcohol -0.139051
percentage expenditure 0.000403
Hepatitis B -0.003710
BMI 0.033776
under-five deaths -0.001319
Polio 0.008982
Diphtheria 0.016367
HIV/AIDS -0.414523
GDP 0.000018
thinness 1-19 years 0.035633
thinness 5-9 years -0.068861
Income composition of resources 10.633458
Schooling 0.960030
In [54]:
model.intercept_
Out[54]:
52.25524864107931
In [55]:
predictions = model.predict(X_test) #make predictions on the test set
In [56]:
plt.scatter(Y_test,predictions) # visualize the predicted values against the actual values using a scatter plot
Out[56]:
<matplotlib.collections.PathCollection at 0x7e0c1bba6920>
In [57]:
coefficients = pd.DataFrame(data=model.coef_.reshape(-1,1),index=X.columns,columns=["coefficients"]) #create a DataFrame to show the coefficients for each feature
print(coefficients)
print("Intercept:", model.intercept_) #print the intercept
print("Function: \n", "y = " + " +\n".join([f"({val:.2f})*({X.columns[ind]})" for ind,val in enumerate(model.coef_)]) + f" +\n({model.intercept_})") #print the linear regression equation
                                 coefficients
Adult Mortality                     -0.018759
Alcohol                             -0.139051
percentage expenditure               0.000403
Hepatitis B                         -0.003710
 BMI                                 0.033776
under-five deaths                   -0.001319
Polio                                0.008982
Diphtheria                           0.016367
 HIV/AIDS                           -0.414523
GDP                                  0.000018
 thinness  1-19 years                0.035633
 thinness 5-9 years                 -0.068861
Income composition of resources     10.633458
Schooling                            0.960030
Intercept: 52.25524864107931
Function: 
 y = (-0.02)*(Adult Mortality) +
(-0.14)*(Alcohol) +
(0.00)*(percentage expenditure) +
(-0.00)*(Hepatitis B) +
(0.03)*( BMI ) +
(-0.00)*(under-five deaths ) +
(0.01)*(Polio) +
(0.02)*(Diphtheria ) +
(-0.41)*( HIV/AIDS) +
(0.00)*(GDP) +
(0.04)*( thinness  1-19 years) +
(-0.07)*( thinness 5-9 years) +
(10.63)*(Income composition of resources) +
(0.96)*(Schooling) +
(52.25524864107931)
In [109]:
# MAE MSE RMSE
from sklearn import metrics
print('MAE1:', metrics.mean_absolute_error(Y_test, predictions))
print('MSE1:', metrics.mean_squared_error(Y_test, predictions))
print('RMSE1:', np.sqrt(metrics.mean_squared_error(Y_test, predictions)))
MAE1: 3.0756719082362913
MSE1: 16.037107579841344
RMSE1: 4.004635761194936
In [58]:
import statsmodels.api as sm
from scipy import stats
In [82]:
X_ = sm.add_constant(X_train)
est = sm.OLS(Y_train, X_)
est2 = est.fit()
print(est2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.829
Model:                            OLS   Adj. R-squared:                  0.827
Method:                 Least Squares   F-statistic:                     451.4
Date:                Sun, 21 Jan 2024   Prob (F-statistic):               0.00
Time:                        18:12:45   Log-Likelihood:                -3556.8
No. Observations:                1319   AIC:                             7144.
Df Residuals:                    1304   BIC:                             7221.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         52.2552      0.785     66.602      0.000      50.716      53.794
x1            -0.0188      0.001    -17.405      0.000      -0.021      -0.017
x2            -0.1391      0.034     -4.145      0.000      -0.205      -0.073
x3             0.0004      0.000      1.985      0.047    4.65e-06       0.001
x4            -0.0037      0.005     -0.726      0.468      -0.014       0.006
x5             0.0338      0.007      5.051      0.000       0.021       0.047
x6            -0.0013      0.001     -1.883      0.060      -0.003     5.5e-05
x7             0.0090      0.006      1.525      0.127      -0.003       0.021
x8             0.0164      0.007      2.444      0.015       0.003       0.030
x9            -0.4145      0.021    -20.020      0.000      -0.455      -0.374
x10         1.827e-05   3.22e-05      0.568      0.570   -4.49e-05    8.14e-05
x11            0.0356      0.058      0.609      0.542      -0.079       0.150
x12           -0.0689      0.058     -1.197      0.232      -0.182       0.044
x13           10.6335      0.957     11.113      0.000       8.756      12.511
x14            0.9600      0.067     14.318      0.000       0.828       1.092
==============================================================================
Omnibus:                       40.658   Durbin-Watson:                   2.006
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               66.118
Skew:                          -0.263   Prob(JB):                     4.39e-15
Kurtosis:                       3.963   Cond. No.                     1.22e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.22e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [83]:
col_mask = (est2.pvalues[1:])<0.05
In [84]:
X.columns[col_mask]
Out[84]:
Index(['Adult Mortality', 'Alcohol', 'percentage expenditure', ' BMI ',
       'Diphtheria ', ' HIV/AIDS', 'Income composition of resources',
       'Schooling'],
      dtype='object')
In [85]:
col_mask2 = np.logical_not(col_mask)
X.columns[col_mask2]
Out[85]:
Index(['Hepatitis B', 'under-five deaths ', 'Polio', 'GDP',
       ' thinness  1-19 years', ' thinness 5-9 years'],
      dtype='object')
In [87]:
#life_expectancy.drop(['Polio','Hepatitis B','GDP',' thinness  1-19 years','under-five deaths ',' thinness 5-9 years'],axis=1,inplace=True)
life_expectancy.head(1)
Out[87]:
Life expectancy Adult Mortality Alcohol percentage expenditure BMI Diphtheria HIV/AIDS Income composition of resources Schooling
0 65.0 263.0 0.01 71.279624 19.1 65.0 0.1 0.479 10.1
In [88]:
X2=life_expectancy.drop('Life expectancy ',axis=1)
Y2=life_expectancy['Life expectancy ']
print(X2,Y2)
      Adult Mortality  Alcohol  percentage expenditure   BMI   Diphtheria   \
0               263.0     0.01               71.279624   19.1         65.0   
1               271.0     0.01               73.523582   18.6         62.0   
2               268.0     0.01               73.219243   18.1         64.0   
3               272.0     0.01               78.184215   17.6         67.0   
4               275.0     0.01                7.097109   17.2         68.0   
...               ...      ...                     ...    ...          ...   
2933            723.0     4.36                0.000000   27.1         65.0   
2934            715.0     4.06                0.000000   26.7         68.0   
2935             73.0     4.43                0.000000   26.3         71.0   
2936            686.0     1.72                0.000000   25.9         75.0   
2937            665.0     1.68                0.000000   25.5         78.0   

       HIV/AIDS  Income composition of resources  Schooling  
0           0.1                            0.479       10.1  
1           0.1                            0.476       10.0  
2           0.1                            0.470        9.9  
3           0.1                            0.463        9.8  
4           0.1                            0.454        9.5  
...         ...                              ...        ...  
2933       33.6                            0.407        9.2  
2934       36.7                            0.418        9.5  
2935       39.8                            0.427       10.0  
2936       42.1                            0.427        9.8  
2937       43.5                            0.434        9.8  

[1649 rows x 8 columns] 0       65.0
1       59.9
2       59.9
3       59.5
4       59.2
        ... 
2933    44.3
2934    44.5
2935    44.8
2936    45.3
2937    46.0
Name: Life expectancy , Length: 1649, dtype: float64
In [89]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X2_train,X2_test,Y2_train,Y2_test = train_test_split(X2.values,Y2.values,test_size=0.2)
In [90]:
model2=LinearRegression()
model2.fit(X2_train,Y2_train)
print(model.score(X2_train,Y2_train)) #r2 score
print(model.score(X2_test,Y2_test))
0.8243791898208145
0.8209650654574213
In [91]:
life_expectancy_coeff2 = pd.DataFrame(model2.coef_,X2.columns,columns=['Coefficient'])
life_expectancy_coeff2
Out[91]:
Coefficient
Adult Mortality -0.018239
Alcohol -0.094395
percentage expenditure 0.000447
BMI 0.040110
Diphtheria 0.020884
HIV/AIDS -0.476124
Income composition of resources 9.661914
Schooling 1.012723
In [92]:
model2.intercept_
Out[92]:
51.68852172103817
In [93]:
predictions2 = model2.predict(X2_test)
In [94]:
plt.scatter(Y2_test,predictions2)
Out[94]:
<matplotlib.collections.PathCollection at 0x7e0c15a03af0>
In [95]:
coefficients2 = pd.DataFrame(data=model2.coef_.reshape(-1,1),index=X2.columns,columns=["coefficients"]) #create a DataFrame to show the coefficients for each feature
print(coefficients2)
print("Intercept:", model2.intercept_) #print the intercept
print("Function: \n", "y = " + " +\n".join([f"({val:.2f})*({X2.columns[ind]})" for ind,val in enumerate(model2.coef_)]) + f" +\n({model2.intercept_})")
                                 coefficients
Adult Mortality                     -0.018239
Alcohol                             -0.094395
percentage expenditure               0.000447
 BMI                                 0.040110
Diphtheria                           0.020884
 HIV/AIDS                           -0.476124
Income composition of resources      9.661914
Schooling                            1.012723
Intercept: 51.68852172103817
Function: 
 y = (-0.02)*(Adult Mortality) +
(-0.09)*(Alcohol) +
(0.00)*(percentage expenditure) +
(0.04)*( BMI ) +
(0.02)*(Diphtheria ) +
(-0.48)*( HIV/AIDS) +
(9.66)*(Income composition of resources) +
(1.01)*(Schooling) +
(51.68852172103817)
In [104]:
X_2 = sm.add_constant(X2_train)
est2_2 = sm.OLS(Y2_train, X_2)
est22 = est2_2.fit()
print(est22.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.826
Model:                            OLS   Adj. R-squared:                  0.825
Method:                 Least Squares   F-statistic:                     776.7
Date:                Sun, 21 Jan 2024   Prob (F-statistic):               0.00
Time:                        18:17:55   Log-Likelihood:                -3585.5
No. Observations:                1319   AIC:                             7189.
Df Residuals:                    1310   BIC:                             7236.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         51.6885      0.687     75.234      0.000      50.341      53.036
x1            -0.0182      0.001    -16.895      0.000      -0.020      -0.016
x2            -0.0944      0.034     -2.808      0.005      -0.160      -0.028
x3             0.0004   6.46e-05      6.916      0.000       0.000       0.001
x4             0.0401      0.006      6.519      0.000       0.028       0.052
x5             0.0209      0.005      4.305      0.000       0.011       0.030
x6            -0.4761      0.023    -21.136      0.000      -0.520      -0.432
x7             9.6619      0.916     10.550      0.000       7.865      11.459
x8             1.0127      0.066     15.373      0.000       0.883       1.142
==============================================================================
Omnibus:                       37.110   Durbin-Watson:                   2.029
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               62.228
Skew:                          -0.230   Prob(JB):                     3.07e-14
Kurtosis:                       3.960   Cond. No.                     1.76e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.76e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [111]:
col_mask2 = (est22.pvalues[1:])<0.05
print(col_mask2)
[ True  True  True  True  True  True  True  True]
In [112]:
print('MAE2:', metrics.mean_absolute_error(Y2_test, predictions2))
print('MSE2:', metrics.mean_squared_error(Y2_test, predictions2))
print('RMSE2:', np.sqrt(metrics.mean_squared_error(Y2_test, predictions2)))
MAE1: 2.8738818423360657
MSE1: 14.660168731913682
RMSE1: 3.828859978102318
In [120]:
D={
    'Model name':['Model 1','Model 2'],
    'Independent Variables': ['Adult Mortality, Alcohol, percentage expenditure, Hepatitis B, BMI, under-five deaths, Polio, Diphtheria, HIV/AIDS, GDP, thinness  1-19 years, thinness 5-9 years, Income composition of resources, Schooling','Adult Mortality, Alcohol, percentage expenditure, BMI, Diphtheria, HIV/AIDS, Income composition of resources, Schooling'],
    'MAE':['3.0756719082362913','2.8738818423360657'],
    'MSE':['16.037107579841344','14.660168731913682'],
    'RMSE':['4.004635761194936','3.828859978102318'],
    'R_SQ':['0.812440597177055','0.8209650654574213']
}
In [121]:
DF = pd.DataFrame(data=D)
DF
Out[121]:
Model name Independent Variables MAE MSE RMSE R_SQ
0 Model 1 Adult Mortality, Alcohol, percentage expenditu... 3.0756719082362913 16.037107579841344 4.004635761194936 0.812440597177055
1 Model 2 Adult Mortality, Alcohol, percentage expenditu... 2.8738818423360657 14.660168731913682 3.828859978102318 0.8209650654574213

Life Expectancy Prediction: Multiple Linear Regression Model¶

Vrushali Kadam - 22BLC1300

Objective:

The objective of this analysis is to build and evaluate a linear regression model for predicting life expectancy based on a given dataset. This involves comprehensive data preprocessing, exploratory data analysis (EDA), and the application of a linear regression model with subsequent evaluation.

Data Loading and Exploration:

We initiated the analysis by loading the life expectancy dataset using Pandas:

import pandas as pd

life_expectancy = pd.read_csv('/content/Life Expectancy Data.csv') life_expectancy.dropna(inplace=True)

Subsequently, we inspected the dataset by examining data types, and providing a statistical summary:

life_expectancy.dtypes life_expectancy.describe()

Data Preprocessing:

To streamline the dataset for linear regression modeling, we removed irrelevant columns ('Country' and 'Status'):

life_expectancy.drop(['Country', 'Status'], inplace=True, axis=1)

Exploratory Data Analysis (EDA):

EDA was performed to better understand relationships within the data. We created a correlation heatmap and a pair plot:

import matplotlib.pyplot as plt import seaborn as sns

def make_corr_heatmap(df): corr = df.corr() fig, ax = plt.subplots(figsize=(12, 9)) sns.heatmap(corr, annot=True) ax.set_xticklabels(ax.get_xticklabels()) plt.show()

make_corr_heatmap(life_expectancy) sns.pairplot(life_expectancy, diag_kind='kde')

Methodology:

Linear Regression Modeling:

The dataset was split into training and testing sets, and a linear regression model was trained using scikit-learn:

from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression

X = life_expectancy.drop('Life expectancy ', axis=1) Y = life_expectancy['Life expectancy ']

X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y.values, test_size=0.2) model = LinearRegression() model.fit(X_train, Y_train)

Model Evaluation and Coefficients:

The model's performance was evaluated using R2 score,MAE,MSE,RMSE. Additionally, we examined the coefficients and intercept of the linear regression model:

For Model 1:

print(model.score(X_train, Y_train)) print(model.score(X_test, Y_test))

life_expectancycoeff = pd.DataFrame(model.coef, X.columns, columns=['Coefficient']) print(life_expectancy_coeff)

from sklearn import metrics print('MAE1:', metrics.mean_absolute_error(Y_test, predictions)) print('MSE1:', metrics.mean_squared_error(Y_test, predictions)) print('RMSE1:', np.sqrt(metrics.mean_squared_error(Y_test, predictions)))

Similarly is done for the Model 2 as well

Results:

We limited the number of models to two due to the significant p-values obtained for all variables in the second model. Model 2 emerges as the optimal choice among various models, providing insights into the impact of relevant independent variables on life expectancy. It serves as the most accurate representation of the data, offering the optimal equation for Multiple Linear Regression.

In [124]:
#RESULTS

DF
Out[124]:
Model name Independent Variables MAE MSE RMSE R_SQ
0 Model 1 Adult Mortality, Alcohol, percentage expenditu... 3.0756719082362913 16.037107579841344 4.004635761194936 0.812440597177055
1 Model 2 Adult Mortality, Alcohol, percentage expenditu... 2.8738818423360657 14.660168731913682 3.828859978102318 0.8209650654574213
In [122]:
%%shell
jupyter nbconvert --to html /content/MultipleRegression.ipynb
[NbConvertApp] Converting notebook /content/MultipleRegression.ipynb to html
[NbConvertApp] Writing 8036868 bytes to /content/MultipleRegression.html
Out[122]: